Etude de marché

Préparation des données

Importation des csv

# Load the 2017 source tables. Every file lives in the same folder and uses the
# same layout (semicolon-separated, header row, declared as UTF-8), so the
# folder and the reader options are written once instead of once per file.
# NOTE(review): `data_dir` is still a machine-specific absolute path — point it
# at the local copy of the data before running the script elsewhere.
data_dir <- "C:/Users/Nourredine/Desktop/Fichiers/OpenClassRooms/Livrables/P9_bahloul_nourredine/DAN-P9-data"

# Read one semicolon-separated CSV file located in `data_dir`.
read_source_csv <- function(file_name) {
  read.csv(file.path(data_dir, file_name),
           encoding = "UTF-8", sep = ";", header = TRUE)
}

dispo_al <- read_source_csv("DisponibiliteAlimentaire_2017.csv")

pib <- read_source_csv("Pib_2017_bis.csv")

pop <- read_source_csv("Population_2017.csv")

Dispo_cal <- read_source_csv("Dispo_calorique_2017.csv")

# StabilitePolitique_2017.csv is loaded once, under the name `Sta_pol`.
Sta_pol <- read_source_csv("StabilitePolitique_2017.csv")

View(Pib_2017_bis)

Importation librairies

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
#library(tidyverse)
#library(tibble)

Première jointure entre PIB & Population

# Left join on the country column: every row of `pib` is kept and the matching
# columns of `pop` are attached to it.
merge1 <- merge(
  x = pib,
  y = pop,
  by = "X.U.FEFF.Pays",
  all.x = TRUE
)
View(merge1)

Seconde jointure avec la table Dispo_alimentaire

# Left join: attach the columns of `dispo_al` to the table built by the
# previous join, keeping every row of `merge1`.
merge2 <- merge(
  x = merge1,
  y = dispo_al,
  by = "X.U.FEFF.Pays",
  all.x = TRUE
)
View(merge2)

Jointure // merge2 et Dispo_cal

# Left join with `Dispo_cal`; both tables use the same key column name, so a
# single `by` is enough.
merge2 <- merge(
  x = merge2,
  y = Dispo_cal,
  by = "X.U.FEFF.Pays",
  all.x = TRUE
)

Jointure // merge, sta_pol

# Left join with `Sta_pol`; its key column is named differently
# ("X.U.FEFF.Country"), hence the separate by.x / by.y.
merge2 <- merge(
  x = merge2,
  y = Sta_pol,
  by.x = "X.U.FEFF.Pays",
  by.y = "X.U.FEFF.Country",
  all.x = TRUE
)

Renommage variables

# Rename all 15 columns. The assignment is positional: the names below must be
# listed in the column order produced by the joins.
names(merge2) <- c(
  "Pays",
  "Pib_dollars_million",
  "pib_habitant_dollars",
  "Population2010",
  "Population2017",
  "Evolution_population",
  "Annee",
  "Production_tonnes",
  "Importations_tonnes",
  "Exportations_tonnes",
  "Dispo_interieure_tonnes",
  "Tx_dependance",
  "Balance_com",
  "Cal_personne_jour",
  "Stabilite_politique"
)

Vérification types de données

sapply(merge2, class)
##                    Pays     Pib_dollars_million    pib_habitant_dollars 
##                "factor"                "factor"                "factor" 
##          Population2010          Population2017    Evolution_population 
##               "integer"               "integer"                "factor" 
##                   Annee       Production_tonnes     Importations_tonnes 
##               "integer"               "integer"               "integer" 
##     Exportations_tonnes Dispo_interieure_tonnes           Tx_dependance 
##               "integer"               "integer"                "factor" 
##             Balance_com       Cal_personne_jour     Stabilite_politique 
##               "integer"               "integer"               "numeric"

Certaines variables n’ont pas le bon format : conversion de Factor en Numeric

# These four columns were read as factors; swap the first "," of each value
# for "." and convert the result to numeric.
decimal_comma_cols <- c(
  "Pib_dollars_million",
  "pib_habitant_dollars",
  "Evolution_population",
  "Tx_dependance"
)
merge2[decimal_comma_cols] <- lapply(
  merge2[decimal_comma_cols],
  function(column) as.numeric(sub(",", ".", column))
)
sapply(merge2, class)
##                    Pays     Pib_dollars_million    pib_habitant_dollars 
##                "factor"               "numeric"               "numeric" 
##          Population2010          Population2017    Evolution_population 
##               "integer"               "integer"               "numeric" 
##                   Annee       Production_tonnes     Importations_tonnes 
##               "integer"               "integer"               "integer" 
##     Exportations_tonnes Dispo_interieure_tonnes           Tx_dependance 
##               "integer"               "integer"               "numeric" 
##             Balance_com       Cal_personne_jour     Stabilite_politique 
##               "integer"               "integer"               "numeric"

Remplacement NA par 0

# Replace missing values by 0 in the numeric columns only. Restricting the
# assignment to numeric columns removes the need for a blanket
# suppressWarnings(), which would also hide genuine problems (for example an
# invalid factor level being written into a factor column).
# NOTE(review): 0 is also written into columns such as Annee and the
# population counts, where it is not a meaningful value — confirm that
# zero-imputation is really intended for every variable.
numeric_cols <- vapply(merge2, is.numeric, logical(1))
merge2[numeric_cols][is.na(merge2[numeric_cols])] <- 0

***Reste-t-il des valeurs manquantes (NA) ?***

any(is.na(merge2))
## [1] FALSE

Redisposition des colonnes

# Put the columns in the order used for the rest of the analysis.
column_order <- c(
  "Pays", "Annee",
  "Population2010", "Population2017", "Evolution_population",
  "Pib_dollars_million", "pib_habitant_dollars",
  "Cal_personne_jour",
  "Production_tonnes", "Dispo_interieure_tonnes",
  "Importations_tonnes", "Exportations_tonnes",
  "Tx_dependance", "Balance_com",
  "Stabilite_politique"
)
merge2 <- merge2[, column_order]

Analyse Exploratoire des Données

Dimensions dataset et types de données

str(merge2)
## 'data.frame':    208 obs. of  15 variables:
##  $ Pays                   : Factor w/ 208 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Annee                  : num  2017 2017 2017 2017 2017 ...
##  $ Population2010         : num  29185507 51216964 2948023 35977455 80827002 ...
##  $ Population2017         : num  36296113 57009756 2884169 41389189 82658409 ...
##  $ Evolution_population   : num  0.24 0.11 -0.02 0.15 0.02 -0.09 0.28 0.09 0.08 0.21 ...
##  $ Pib_dollars_million    : num  1.86e+07 3.49e+08 1.30e+07 1.70e+08 3.68e+09 ...
##  $ pib_habitant_dollars   : num  513 6122 4514 4110 44552 ...
##  $ Cal_personne_jour      : num  6 146 55 22 38 0 34 0 233 130 ...
##  $ Production_tonnes      : num  28000 1667000 13000 275000 1514000 ...
##  $ Dispo_interieure_tonnes: num  57000 2118000 47000 277000 1739000 ...
##  $ Importations_tonnes    : num  29000 514000 38000 2000 842000 0 277000 0 7000 722000 ...
##  $ Exportations_tonnes    : num  0 63000 0 0 646000 0 0 0 0 10000 ...
##  $ Tx_dependance          : num  0.51 0.24 0.81 0.01 0.48 0 0.87 0 1 0.5 ...
##  $ Balance_com            : num  0 -451000 -38000 -2000 -196000 0 -277000 0 -7000 -712000 ...
##  $ Stabilite_politique    : num  -2.8 0 0 0 0 0 -0.33 0 0 0 ...

Distribution des données

summary(merge2)
##              Pays         Annee      Population2010     
##  Afghanistan   :  1   Min.   :   0   Min.   :0.000e+00  
##  Afrique du Sud:  1   1st Qu.:2017   1st Qu.:8.549e+05  
##  Albanie       :  1   Median :2017   Median :6.191e+06  
##  Algérie       :  1   Mean   :1619   Mean   :3.307e+07  
##  Allemagne     :  1   3rd Qu.:2017   3rd Qu.:2.120e+07  
##  Andorre       :  1   Max.   :2017   Max.   :1.369e+09  
##  (Other)       :202                                     
##  Population2017      Evolution_population Pib_dollars_million
##  Min.   :0.000e+00   Min.   :-0.20000     Min.   :4.391e+04  
##  1st Qu.:1.166e+06   1st Qu.: 0.03000     1st Qu.:5.324e+06  
##  Median :6.910e+06   Median : 0.08000     Median :2.381e+07  
##  Mean   :3.612e+07   Mean   : 0.09774     Mean   :3.849e+08  
##  3rd Qu.:2.480e+07   3rd Qu.: 0.16250     3rd Qu.:1.752e+08  
##  Max.   :1.421e+09   Max.   : 0.53000     Max.   :1.954e+10  
##                                                              
##  pib_habitant_dollars Cal_personne_jour Production_tonnes 
##  Min.   :   107       Min.   :  0.0     Min.   :       0  
##  1st Qu.:  2025       1st Qu.:  7.0     1st Qu.:    1000  
##  Median :  6169       Median : 40.5     Median :   37500  
##  Mean   : 16429       Mean   : 55.3     Mean   :  582466  
##  3rd Qu.: 19551       3rd Qu.: 86.0     3rd Qu.:  203250  
##  Max.   :171278       Max.   :239.0     Max.   :21914000  
##                                                           
##  Dispo_interieure_tonnes Importations_tonnes Exportations_tonnes
##  Min.   :       0        Min.   :      0     Min.   :      0    
##  1st Qu.:    7000        1st Qu.:      0     1st Qu.:      0    
##  Median :   54500        Median :   9000     Median :      0    
##  Mean   :  556764        Mean   :  67928     Mean   :  82567    
##  3rd Qu.:  253750        3rd Qu.:  51000     3rd Qu.:   6000    
##  Max.   :18266000        Max.   :1069000     Max.   :4223000    
##                                                                 
##  Tx_dependance     Balance_com       Stabilite_politique
##  Min.   :0.0000   Min.   :-1059000   Min.   :-2.80000   
##  1st Qu.:0.0000   1st Qu.:  -17000   1st Qu.: 0.00000   
##  Median :0.1300   Median :       0   Median : 0.00000   
##  Mean   :0.3517   Mean   :   18111   Mean   :-0.05774   
##  3rd Qu.:0.6975   3rd Qu.:       0   3rd Qu.: 0.00000   
##  Max.   :2.2200   Max.   : 4220000   Max.   : 1.33000   
## 

Corrélations

# Correlation matrix of every column except the first one (Pays).
res <- cor(merge2[, -1])
#res
library(corrplot)
## corrplot 0.92 loaded
# Upper triangle only, variables ordered by hierarchical clustering.
corrplot(
  res,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

Sélection de quelques catégories pour comparer

library(tidyverse)
## Registered S3 method overwritten by 'rvest':
##   method            from
##   read_xml.response xml2
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v tibble  3.1.1     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Reshape the four tonnage columns to long format: one row per
# (Pays, Categorie) pair, with the tonnage stored in `value`.
df2 <- merge2[, c("Pays", "Dispo_interieure_tonnes", "Production_tonnes",
                  "Importations_tonnes", "Exportations_tonnes")] %>%
  pivot_longer(
    c("Dispo_interieure_tonnes", "Production_tonnes",
      "Importations_tonnes", "Exportations_tonnes"),
    names_to = "Categorie"
  )
view(df2)

# One boxplot per category, with the individual observations jittered on top.
# The title previously read "dans la monde"; corrected to "dans le monde".
ggplot(data = df2, aes(x = Categorie, y = value, color = Categorie)) +
  geom_boxplot() +
  scale_color_brewer(palette = "Dark2") +
  geom_jitter(shape = 16, position = position_jitter(0.2)) +
  labs(title = "Marché de la volaille dans le monde",
       y = "en tonnes", x = "catégorie")

Etiqueter les outliers

#' Flag the values lying outside the 1.5 * IQR fences.
#'
#' A value is flagged when it is below Q1 - 1.5 * IQR or above
#' Q3 + 1.5 * IQR, with Q1/Q3 the 25% and 75% quantiles of `x`.
#'
#' @param x A numeric vector. Missing values are ignored when the quartiles
#'   are computed (previously any NA made `quantile()` stop with an error).
#' @return A logical vector of the same length as `x`; `NA` where `x` is `NA`.
is_outlier <- function(x) {
  # Both quartiles in one call; the IQR is their difference, so it is
  # computed once instead of calling IQR() twice.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence <- 1.5 * (quartiles[2] - quartiles[1])
  x < quartiles[1] - fence | x > quartiles[2] + fence
}   # define a function to detect outliers
str(df2)
## tibble [832 x 3] (S3: tbl_df/tbl/data.frame)
##  $ Pays     : Factor w/ 208 levels "Afghanistan",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ Categorie: chr [1:832] "Dispo_interieure_tonnes" "Production_tonnes" "Importations_tonnes" "Exportations_tonnes" ...
##  $ value    : num [1:832] 57000 28000 29000 0 2118000 ...

Créer une colonne ‘outlier’

# Build the label column used by the outlier plot.
# Outliers are detected *within each Categorie* so that the labelled points
# are the same ones a per-category boxplot draws as outliers; computing them
# on all values pooled together compares every category against fences that
# belong to none of them.
df2$Pays <- as.character(df2$Pays)
df7 <- df2 %>%
  group_by(Categorie) %>%
  mutate(is_outlier = ifelse(is_outlier(value), Pays, NA_character_)) %>%
  ungroup()
View(df7)
# Blank out the country name of every non-outlier row so that only outliers
# carry a (non-NA) label.
df7$Pays[is.na(df7$is_outlier)] <- NA_character_
View(df7)

Visualisation des outliers

# Text sizes, legend placement and caption styling for the outlier plot.
outlier_plot_theme <- theme(
  axis.text = element_text(size = 10),
  legend.text = element_text(size = 10),
  legend.title = element_text(size = 11),
  legend.position = 'right',
  aspect.ratio = 1.4,
  plot.title = element_text(size = 15, face = "bold"),
  plot.subtitle = element_text(size = 10),
  plot.caption = element_text(color = "Red", face = "italic", size = 13)
)

# One boxplot per category with red outlier points; each row whose `Pays` is
# not NA is labelled with the country name.
ggplot(data = df7, mapping = aes(x = Categorie, y = value, fill = Categorie)) +
  geom_boxplot(
    alpha = 0.7,
    width = 0.6,
    outlier.colour = 'red',
    outlier.shape = 19,
    outlier.size = 3
  ) +
  geom_text(mapping = aes(label = Pays), na.rm = TRUE, hjust = -0.2) +
  theme_grey() +
  labs(
    title = 'Répartition des postes avec outliers',
    subtitle = 'Postes principaux d\'échanges',
    caption  = 'Trop de pays ressortent en tant qu\'outliers',
    x = '',
    y = 'En tonnes'
  ) +
  outlier_plot_theme

Suppression de colonnes

# Drop the columns that are not used for the clustering; the remaining columns
# keep their current order.
dropped_cols <- c(
  "Annee", "Population2010", "Population2017",
  "Production_tonnes", "Dispo_interieure_tonnes", "Importations_tonnes",
  "Pib_dollars_million", "Cal_personne_jour", "Stabilite_politique",
  "Exportations_tonnes"
)
merge2 <- merge2[, setdiff(names(merge2), dropped_cols)]

Création du dendrogramme

Mise de ‘Pays’ en index (il ne faut aucune variable non numérique pour le clustering)

library(tibble)
# Move the country names from the `Pays` column into the row names, leaving
# only the remaining columns in the table.
merge2 <- column_to_rownames(merge2, var = 'Pays')

Suppression lignes

# Remove three countries (identified by row name) from the table.
excluded_countries <- c("Brésil", "États-Unis d'Amérique", "Chine, continentale")
is_kept <- !(row.names(merge2) %in% excluded_countries)
merge2 <- merge2[is_kept, ]

Maintenant on réduit les données

# Centre and scale every column (scale() defaults), then convert the resulting
# matrix back to a data frame.
scaled_matrix <- scale(merge2)
merge2_sc <- as.data.frame(scaled_matrix)
summary(merge2_sc)
##  Evolution_population pib_habitant_dollars Tx_dependance    
##  Min.   :-2.8559      Min.   :-0.6435      Min.   :-0.7772  
##  1st Qu.:-0.6549      1st Qu.:-0.5672      1st Qu.:-0.7772  
##  Median :-0.1765      Median :-0.4042      Median :-0.4722  
##  Mean   : 0.0000      Mean   : 0.0000      Mean   : 0.0000  
##  3rd Qu.: 0.6848      3rd Qu.: 0.1274      3rd Qu.: 0.7917  
##  Max.   : 4.1298      Max.   : 6.1643      Max.   : 4.0602  
##   Balance_com      
##  Min.   :-5.90705  
##  1st Qu.: 0.01834  
##  Median : 0.11501  
##  Mean   : 0.00000  
##  3rd Qu.: 0.11501  
##  Max.   : 5.63097

détermination du type de distance ; ici: euclidienne

# Pairwise Euclidean distances between the rows of the scaled table.
dist_mat <- dist(
  x = merge2_sc,
  method = 'euclidean'
)

Création du dendrogramme par partition hiérarchique

# Agglomerative hierarchical clustering with average linkage.
hclust_avg <- hclust(
  d = dist_mat,
  method = 'average'
)
#plot(hclust_avg)

Coupage du dendrogramme pour ne garder que les principaux clusters

# Cut the tree into 4 groups; the result gives the group of each row.
cut_avg <- cutree(
  tree = hclust_avg,
  k = 4
)

Colorisation des clusters

suppressPackageStartupMessages(library(dendextend))
avg_dend_obj <- as.dendrogram(hclust_avg)

# Colour the branches by the same partition as cutree(hclust_avg, k = 4).
# `k` is the number of clusters; `h = 4` would instead cut the tree at
# height 4, which generally yields a different number of groups than the 4
# clusters used in the rest of the analysis.
avg_col_dend <- color_branches(avg_dend_obj, k = 4)
plot(avg_col_dend)

Combien d’informations du dataframe sont attribuées à chaque cluster ?

suppressPackageStartupMessages(library(dplyr))
# Add the hierarchical-clustering group of each row as a `cluster` column,
# then count the rows in each group.
merge2_cl <- merge2 %>%
  mutate(cluster = cut_avg)
merge2_cl %>%
  count(cluster)
##   cluster   n
## 1       1 195
## 2       2   4
## 3       3   2
## 4       4   4

On passe désormais à la méthode K-means

#install.packages("readxl")
#install.packages("FactoMineR")
#install.packages("factoextra")

Importations librairies nécessaires

library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)

Les données ont déjà été mises à l’échelle avec le dendrogramme

Trouver le nombre optimal de clusters

fviz_nbclust(merge2_sc, kmeans, method = "wss")

Réalisation du K-means grâce au nombre optimal K

#make this example reproducible
# Fix the random seed so the k-means result is reproducible.
set.seed(1)
# k-means with 4 centres; 25 random starts are tried and the best one is kept.
km <- kmeans(
  x = merge2_sc,
  centers = 4,
  nstart = 25
)

Liste des clusters par individu

#view results
#km

Déploiement du K-means

#plot results of final k-means model
fviz_cluster(km, data = merge2_sc)

Centroïdes

#find means of each cluster
# Mean of every (unscaled) variable within each k-means cluster.
clusters <- aggregate(
  x = merge2,
  by = list(cluster = km$cluster),
  FUN = mean
)
clusters
##   cluster Evolution_population pib_habitant_dollars Tx_dependance
## 1       1           0.05809524             72997.93     0.2766667
## 2       2           0.04200000             11874.22     0.1053333
## 3       3           0.22705882              5418.09     0.2403922
## 4       4           0.08372093             10712.01     1.0597674
##   Balance_com
## 1   -86952.38
## 2    22888.89
## 3   -90098.04
## 4     5000.00

Les pays du cluster 4 disposent beaucoup moins qu’ils ne produisent : ils exportent l’essentiel de leur production.

Les pays du cluster 3 disposent presque autant qu’ils produisent : ils sont auto-suffisants.

Les pays du cluster 2 disposent plus qu’ils ne produisent : ils ont une réelle dépendance / demande.

#add cluster assigment to original data
# Append the k-means cluster assignment of each row to the unscaled data as a
# new `cluster` column.
final_data <- cbind(merge2, cluster = km$cluster)

# Open the resulting table in the data viewer.
View(final_data)

Suppression de la variable “cluster” dans le nouveau df “clusters”

# Remove the `cluster` id (first column) so that only the per-cluster means of
# the variables remain.
clusters <- clusters %>%
  select(-cluster)

l’Analyse en Composante Principale

1ère étape : quelle est la part de chaque composante principale sur le plan factoriel ?

# PCA on the country-level table; variables are centred and scaled to unit
# variance before the decomposition.
merge2.pca <- prcomp(
  x = merge2,
  center = TRUE,
  scale. = TRUE
)

# Standard deviation and share of variance of each principal component.
summary(merge2.pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4
## Standard deviation     1.1163 0.9965 0.9856 0.8884
## Proportion of Variance 0.3115 0.2483 0.2429 0.1973
## Cumulative Proportion  0.3115 0.5598 0.8027 1.0000

Description de l’ACP

str(merge2.pca)
## List of 5
##  $ sdev    : num [1:4] 1.116 0.997 0.986 0.888
##  $ rotation: num [1:4, 1:4] -0.6754 0.5343 -0.3509 0.3678 0.0177 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "Evolution_population" "pib_habitant_dollars" "Tx_dependance" "Balance_com"
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  $ center  : Named num [1:4] 9.84e-02 1.63e+04 3.57e-01 -2.02e+04
##   ..- attr(*, "names")= chr [1:4] "Evolution_population" "pib_habitant_dollars" "Tx_dependance" "Balance_com"
##  $ scale   : Named num [1:4] 1.04e-01 2.51e+04 4.59e-01 1.76e+05
##   ..- attr(*, "names")= chr [1:4] "Evolution_population" "pib_habitant_dollars" "Tx_dependance" "Balance_com"
##  $ x       : num [1:205, 1:4] -1.325 -1.102 0.132 -0.289 0.646 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:205] "Afghanistan" "Afrique du Sud" "Albanie" "Algérie" ...
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  - attr(*, "class")= chr "prcomp"

Installation librairies nécessaires

#install.packages("Rtools", force = TRUE)
library(devtools)
## Warning: package 'devtools' was built under R version 3.6.3
## Loading required package: usethis
# Install ggbiplot from GitHub only when it is not already installed, so that
# re-running the script does not contact GitHub (and attempt a rebuild) on
# every execution.
if (!requireNamespace("ggbiplot", quietly = TRUE)) {
  install_github("vqv/ggbiplot")
}
## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install Rtools 3.5 from https://cran.r-project.org/bin/windows/Rtools/.
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
##   Use `force = TRUE` to force installation

Mise en forme de l’ACP (représentation par individu)

library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(ggbiplot)
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## Loading required package: grid
ggbiplot(merge2.pca)

Avec des labels pour chaque pays

ggbiplot(merge2.pca, labels=rownames(merge2))

Pas très parlant. Mieux vaut à la place mettre nos clusters

Repassons par la 1ère étape

clusters.pca <- prcomp(clusters, center = TRUE, scale. = TRUE)

summary(clusters.pca)
## Importance of components:
##                           PC1    PC2    PC3       PC4
## Standard deviation     1.3242 1.2037 0.8930 7.359e-17
## Proportion of Variance 0.4384 0.3622 0.1994 0.000e+00
## Cumulative Proportion  0.4384 0.8006 1.0000 1.000e+00

Diagramme plus clair avec une meilleure représentativité

ggbiplot(clusters.pca)

Idem avec un cercle pour s’assurer que chaque vecteur est bien représenté

# Biplot of the cluster-level PCA (default components), with the correlation
# circle drawn and each point labelled by its row name.
ggbiplot(
  clusters.pca,
  obs.scale = 1,
  var.scale = 1,
  circle = TRUE,
  labels = rownames(clusters)
) +
  ggtitle("ACP marché mondial") +
  theme_minimal() +
  theme(legend.position = "right")

Eventuellement faire apparaître un deuxième plan factoriel avec la 3ème composante principale pour distinguer les variables dont l’inertie n’était pas forte dans PC1 et PC2

# Same biplot on the second factorial plane (components 2 and 3).
ggbiplot(
  clusters.pca,
  choices = c(2, 3),
  obs.scale = 1,
  var.scale = 1,
  circle = TRUE,
  labels = rownames(clusters)
) +
  ggtitle("ACP marché mondial") +
  theme_minimal() +
  theme(legend.position = "right")

Corrélations entre nos principaux clusters et les différentes variables

# Distribution of three variables within each k-means cluster. The four fill
# colours are shared by the three plots, so they are defined once.
cluster_palette <- c("lightblue", "greenyellow", "mediumpurple1", "peachpuff2")

# Axis label corrected: "hanitant" -> "habitant".
boxplot(merge2$pib_habitant_dollars ~ km$cluster,
        ylab = "Pib par habitant en dollars", xlab = "Clusters",
        col = cluster_palette)

boxplot(merge2$Tx_dependance ~ km$cluster,
        ylab = "Taux de dépendance", xlab = "Clusters",
        col = cluster_palette)

boxplot(merge2$Evolution_population ~ km$cluster,
        ylab = "Evolution population", xlab = "Clusters",
        col = cluster_palette)

# One data frame per k-means cluster; each is opened in the data viewer.
cluster_1 <- subset(final_data, cluster == 1)
View(cluster_1)
cluster_2 <- subset(final_data, cluster == 2)
View(cluster_2)
cluster_3 <- subset(final_data, cluster == 3)
View(cluster_3)
cluster_4 <- subset(final_data, cluster == 4)
View(cluster_4)

Création heatmap pour identifier potentiel pays

clusters_sc <- as.data.frame(scale(clusters))
library(dendextend)
mycols <- c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07")

# Hierarchically cluster the rows of `tbl` (default dist() and hclust()
# settings) and return the dendrogram with branch width 1 and branches
# coloured by a 4-group cut.
build_colored_dend <- function(tbl) {
  dend <- as.dendrogram(hclust(dist(tbl)))
  dend <- set(dend, "branches_lwd", 1)
  set(dend, "branches_k_color", mycols[1:4], k = 4)
}

# Dendrogram for the rows
row_dend <- build_colored_dend(clusters_sc)

# Dendrogram for the columns (rows of the transposed table)
col_dend <- build_colored_dend(t(clusters_sc))
library(heatmaply)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: viridis
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 3.6.3
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
## 
##     viridis_pal
## 
## ======================
## Welcome to heatmaply version 1.3.0
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags: 
##   https://stackoverflow.com/questions/tagged/heatmaply
## ======================
# Visualize the heatmap
# Interactive heatmap of the scaled cluster means, ordered by the row and
# column dendrograms.
heatmaply(
  x = clusters_sc,
  Rowv = row_dend,
  Colv = col_dend
  # options left disabled:
  #   seriate = "none", row_dend_right = TRUE, plot_method = "plotly"
)

On “redésindexe” la colonne “Pays”

# Turn the row names back into a regular `Pays` column.
final_data <- rownames_to_column(final_data, var = 'Pays')

On affiche les pays sélectionnés dans un df

# Keep the six selected countries and sort them by decreasing population
# growth.
selected_countries <- c(
  'Qatar', 'Oman', 'Koweït',
  'Émirats arabes unis', 'Bahreïn', 'Arabie saoudite'
)
filter_country <- final_data %>%
  filter(Pays %in% selected_countries) %>%
  arrange(desc(Evolution_population))
View(filter_country)

**Comme on ne s’intéresse pas qu’à une seule variable, on observe les clusters a priori les moins intéressants avec les variables qui nous intéressent.**

Évolution de la population et PIB par habitant au sein du cluster 3

# Horizontal bar chart of Evolution_population for the rows of cluster 3,
# ordered by value; the blue line marks the cluster mean.
mean_evolution <- mean(cluster_3$Evolution_population)
ggplot(
  data = cluster_3,
  mapping = aes(
    x = reorder(row.names(cluster_3), Evolution_population),
    y = Evolution_population
  )
) +
  geom_bar(mapping = aes(fill = Evolution_population), stat = 'identity') +
  coord_flip() +
  theme_grey() +
  scale_fill_gradient(name = "") +
  labs(
    title = 'Rang Evolution_population du cluster 3',
    x = 'Pays',
    y = 'Taux d\'évolution'
  ) +
  geom_hline(yintercept = mean_evolution, size = 1, color = 'blue')

# Horizontal bar chart of pib_habitant_dollars for the rows of cluster 3,
# ordered by value; the blue line marks the cluster mean.
mean_pib_habitant <- mean(cluster_3$pib_habitant_dollars)
ggplot(
  data = cluster_3,
  mapping = aes(
    x = reorder(row.names(cluster_3), pib_habitant_dollars),
    y = pib_habitant_dollars
  )
) +
  geom_bar(mapping = aes(fill = pib_habitant_dollars), stat = 'identity') +
  coord_flip() +
  theme_grey() +
  scale_fill_gradient(name = "") +
  labs(
    title = 'Rang pays Pib / habitant du cluster 3',
    x = 'Pays',
    y = 'Pib / habitant'
  ) +
  geom_hline(yintercept = mean_pib_habitant, size = 1, color = 'blue')

Sources utilisées:

- Analyse exploratoire : https://towardsdatascience.com/exploratory-data-analysis-in-r-for-beginners-fe031add7072
- Dendrogramme : https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
- K-means : https://www.statology.org/k-means-clustering-in-r/
- ACP : https://www.datacamp.com/community/tutorials/pca-analysis-r
- Heatmap : https://www.datanovia.com/en/blog/how-to-create-a-beautiful-interactive-heatmap-in-r/